In [2]:
from __future__ import print_function
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)
In [4]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',
                                  remove=('headers', 'footers', 'quotes'),
                                  categories=categories, shuffle=True, random_state=42)
twenty_train.target_names
Out[4]:
In [5]:
print(twenty_train.data[0])
print('Target: ', twenty_train.target[0])
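In [ ]:
# Quick sanity check (added, not part of the original run): the class balance
# of the training split, counted from the numeric targets loaded above.
import numpy as np
np.bincount(twenty_train.target)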
In [6]:
# Text preprocessing, tokenizing and filtering of stopwords
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=5000,
                                stop_words='english')
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape
Out[6]:
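In [ ]:
# Optional peek (added): a few entries of the fitted vocabulary, which maps
# each token to its column index in X_train_counts.
sorted(tf_vectorizer.vocabulary_.items())[:10]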
In [8]:
# From occurrences to frequencies: with use_idf=False this only computes
# (L2-normalised) term frequencies, with no idf re-weighting
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape
Out[8]:
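In [ ]:
# Added check: with use_idf=False the transformer only L2-normalises the raw
# counts, so every row of X_train_tfidf should have unit norm.
from scipy.sparse.linalg import norm as sparse_norm
sparse_norm(X_train_tfidf[0])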
In [21]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 6
n_top_words = 20

# Note: since scikit-learn 0.21 the LDA parameter is n_components
# (the older n_topics keyword was removed).
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(X_train_counts)
Out[21]:
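In [ ]:
# Rough goodness-of-fit check (added): perplexity of the fitted model on the
# training counts; lower is better when comparing different topic counts.
lda.perplexity(X_train_counts)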
In [22]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

# get_feature_names_out() replaced get_feature_names() in scikit-learn 1.0
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)
In [23]:
X_trn_topics = lda.transform(X_train_counts)
X_trn_topics.shape
Out[23]:
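In [ ]:
# Each row of X_trn_topics is a document's topic distribution, so the six
# topic weights of each document sum to 1 (added check).
X_trn_topics[:3].sum(axis=1)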
In [ ]:
from sklearn.neighbors import RadiusNeighborsClassifier
neigh = RadiusNeighborsClassifier(radius=1.0)
neigh.fit(X_trn_topics, twenty_train.target)
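In [ ]:
# Added check: training accuracy of the radius classifier in topic space.
# With radius=1.0 every training point finds at least itself as a neighbour.
neigh.score(X_trn_topics, twenty_train.target)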
In [30]:
from sklearn.pipeline import Pipeline
import numpy as np

# The held-out test split is needed for the evaluations below.
twenty_test = fetch_20newsgroups(subset='test',
                                 remove=('headers', 'footers', 'quotes'),
                                 categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data

text_lda_knn = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2,
                                                  max_features=10000,
                                                  stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=200, max_iter=25,
                                                           learning_method='online',
                                                           learning_offset=200.,
                                                           random_state=0)),
                         ('clf', RadiusNeighborsClassifier(radius=1.0))])
_ = text_lda_knn.fit(twenty_train.data, twenty_train.target)
predicted = text_lda_knn.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[30]:
In [ ]:
from sklearn.linear_model import SGDClassifier

text_lda_sgd = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2,
                                                  max_features=10000,
                                                  stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=200, max_iter=25,
                                                           learning_method='online',
                                                           learning_offset=200.,
                                                           random_state=0)),
                         # n_iter was replaced by max_iter in newer scikit-learn
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, max_iter=25, random_state=42))])
_ = text_lda_sgd.fit(twenty_train.data, twenty_train.target)
predicted = text_lda_sgd.predict(docs_test)
np.mean(predicted == twenty_test.target)
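In [ ]:
# A minimal sketch (added, assuming the pipeline above): hyper-parameters of
# any pipeline step can be grid-searched by prefixing them with the step name.
# Fitting on a small subset here because online LDA with 200 topics is slow.
from sklearn.model_selection import GridSearchCV
parameters = {'lda__n_components': (100, 200),
              'clf__alpha': (1e-2, 1e-3)}
gs = GridSearchCV(text_lda_sgd, parameters, n_jobs=-1)
_ = gs.fit(twenty_train.data[:400], twenty_train.target[:400])
gs.best_params_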
In [12]:
# Score 2 new docs. No classifier was fit on the tf-idf features earlier in
# this run, so a simple Naive Bayes fit is assumed here for `clf`.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = tf_vectorizer.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# a new name, so the test-set `predicted` above is not overwritten
predicted_new = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted_new):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
In [19]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))
In [20]:
metrics.confusion_matrix(twenty_test.target, predicted)
Out[20]:
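In [ ]:
# Labelled view of the confusion matrix (added; assumes pandas is available).
import pandas as pd
pd.DataFrame(metrics.confusion_matrix(twenty_test.target, predicted),
             index=twenty_test.target_names, columns=twenty_test.target_names)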